Load the Required Packages:

Below, the packages required for data analysis and visualization are loaded.

library(tidyverse)
library(magrittr)
library(DBI)
library(dbplyr)
library(RMariaDB)
library(data.table)
library(stopwords)
library(tidytext)
library(RColorBrewer)
library(DT)
library(wordcloud)

State the Research Question:

W. Edwards Deming said, “In God we trust, all others must bring data.” Below, we will use data to explore the question, “Which are the most valued data science skills?”

Connect to the SQL Database:

# Connect to the MariaDB jobs database. The password is stored out of band in
# sql_db.txt; readLines() reads the first line verbatim, whereas the original
# read.table() call would fail on passwords containing whitespace or comment
# characters ("#").
# NOTE(review): username/host are hard-coded here — consider moving the full
# credential set to environment variables or a config file.
con <- DBI::dbConnect(
  RMariaDB::MariaDB(),
  dbname = "dat_sci_jobs",
  username = "root",
  password = readLines("sql_db.txt", n = 1),
  host = "35.227.102.234")

Read from the SQL Database and Disconnect:

# List the available tables, pull the master jobs table, and release the
# database connection before rendering the data interactively.
tables <- DBI::dbListTables(con)
jobs_df <- DBI::dbReadTable(con, "_Jobs")
DBI::dbDisconnect(con)

# Render the jobs table as an interactive widget, 25 rows per page.
datatable(jobs_df, options = list(pageLength = 25))

Create a Data Frame of Jobs Found via RSS Feed By Combining CSV Files and Removing Duplicates:

# Combine the per-site RSS feed CSVs into one data frame of new jobs, reading
# only files that were not processed on a previous run.
completed_files <- readLines("completed_files.txt")

files <- list.files(pattern = "_feeds_.*csv$")
url_base <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/"

# setdiff() preserves the order of `files`, so rows append in the same order
# the original one-file-at-a-time loop produced. (Also avoids 1:length(files),
# which mis-iterates when `files` is empty.)
pending <- setdiff(files, completed_files)

# Read each pending CSV once and bind them all at the end, instead of growing
# the data frame with rbind() inside a loop (quadratic copying).
new_parts <- lapply(pending, function(f) {
    read.csv(file = paste0(url_base, f), header = TRUE)
})
new_jobs_df <- if (length(new_parts) > 0) {
    do.call(rbind, new_parts)
} else {
    # No new files: keep the original 0-row, 11-column placeholder shape.
    as.data.frame(matrix(nrow = 0, ncol = 11))
}
completed_files <- union(completed_files, pending)

writeLines(completed_files, "completed_files.txt")

# Drop exact duplicate rows.
new_jobs_df <- new_jobs_df[!duplicated(new_jobs_df), ]

Remove Unnecessary Columns in Jobs Data Frame and Rearrange Remaining Columns to Prepare the Data for the SQL Data Base:

# Trim the new jobs down to the columns the SQL _Jobs table stores, rename
# them to the schema, and merge them into the master jobs_df, deduplicating
# by job URL.
if (nrow(new_jobs_df) > 0){
    # Drop RSS metadata columns that are not persisted to the database.
    new_jobs_df <- subset(new_jobs_df, select = -c(X, author, summary,
        content, extracted_content_url, published, created_at))
    cols <- c("Job_id", "Site_id", "Job_title", "Job_url")
    colnames(new_jobs_df) <- cols
    rownames(new_jobs_df) <- NULL
    # Reorder columns to match the layout of the existing jobs_df.
    new_jobs_df <- new_jobs_df[c("Job_id", "Job_title", "Job_url",
                                 "Site_id")]
    # Job_complete = 0 flags a listing whose description has not been
    # scraped yet (later set to 1 on success or -1 on failure).
    new_jobs_df %<>%
        mutate(Job_complete = 0)
    # Append the new rows, then keep only the first occurrence of each
    # Job_url (data.table evaluates duplicated(Job_url) inside the table).
    jobs_df <- rbindlist(list(jobs_df, new_jobs_df))[!duplicated(Job_url)]
    # Renumber Job_id sequentially after the merge/dedupe.
    jobs_df %<>%
        mutate(Job_id = row_number())
}

Scrape Each Unique Job Listing URL:

# Scrape the description text of each job listing that has not been processed
# yet. Column 1 is Job_id, column 4 is Site_id, column 5 is Job_complete
# (0 = pending, 1 = scraped, -1 = fetch failed).
#
# Per-site XPath of the element holding the job description, keyed by Site_id.
# (The original if/else chain silently reused the previous iteration's `desc`
# when a Site_id matched no branch; the lookup below skips such rows instead.)
desc_xpaths <- c(
    "2594160" = "//div[contains(@id, 'job-description')]",                 # ai-jobs.net
    "977141"  = "//div[contains(@class, 'job-description')]",              # python.org
    "2594162" = "//div[contains(@class, 'arDetailDescriptionRow')]",       # careercast it & eng
    "1378327" = "//div[contains(@class, 'section_content')]",              # jobs for r-users
    "2593879" = "//div[contains(@class, 'jobsearch-jobDescriptionText')]", # Indeed
    "2594166" = "//div[contains(@class, 'job-desc')]",                     # Open Data Science
    "2594174" = "//div[contains(@class, 'job_description')]"               # MLconf
)
for (i in seq_len(nrow(jobs_df))){
    # Skip listings already scraped (1) or already marked as failed (-1).
    if (jobs_df[[5]][i] != 0){
        next
    }
    # NOTE(review): the original called httr::user_agent("Glen Davis") here,
    # but the returned request-config object was discarded, so it had no
    # effect on xml2::read_html; the no-op call has been removed.
    dat <- try(xml2::read_html(jobs_df$Job_url[[i]]), silent = TRUE)
    if (inherits(dat, "try-error")){
        jobs_df[i, 5] <- -1  # mark the URL as unreachable and move on
        next
    }
    xpath <- desc_xpaths[as.character(jobs_df[[4]][i])]
    if (is.na(xpath)){
        next  # unknown source site: no extraction rule defined
    }
    desc <- xml2::xml_text(xml2::xml_find_all(dat, xpath))
    # Save the description to "<Job_id>.txt" and flag the row as complete.
    writeLines(desc, con = paste0(jobs_df[[1]][i], ".txt"))
    jobs_df[i, 5] <- 1
}

Connect to the SQL Database:

# Reconnect to the MariaDB jobs database (the earlier connection was closed
# after reading). readLines() reads the password verbatim, whereas the
# original read.table() call would fail on passwords containing whitespace or
# comment characters ("#").
con <- DBI::dbConnect(
  RMariaDB::MariaDB(),
  dbname = "dat_sci_jobs",
  username = "root",
  password = readLines("sql_db.txt", n = 1),
  host = "35.227.102.234")

Write to the SQL Database and Disconnect:

# Refresh the table listing, persist the updated jobs table back to the
# database (replacing the existing _Jobs table), and close the connection.
tables <- DBI::dbListTables(con)
DBI::dbWriteTable(con, name = "_Jobs", value = jobs_df, overwrite = TRUE)
DBI::dbDisconnect(con)

Analyze the Scraped Job Description Text Files with TidyText:

# Build/extend text_df (columns: Text_id, Text, Job_id, Line), one row per
# line of each scraped job-description .txt file: archive new files under
# jobs-txt/, read their lines from the GitHub mirror, and append them.
files <- list.files(pattern = "^[1-9]+.*txt$")
completed_txt_files <- readLines("completed_txt_files.txt")
text_df <- read.csv("text_df.csv")

if (length(files) > 0){
    # Archive the raw scrapes under jobs-txt/ and remove the working copies.
    file.copy(from = file.path(getwd(), files),
              to = file.path(getwd(), "jobs-txt", files))
    # file.remove() has no `from` argument; pass the paths positionally.
    file.remove(file.path(getwd(), files))
    url_base <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/jobs-txt/"
    cols <- c("Text", "Job_id", "Line")
    pending <- setdiff(files, completed_txt_files)
    # Build one small data frame per file and bind them once at the end,
    # instead of growing text_df with rbind() inside a loop. (The original
    # inner loop also rebuilt the Job_id/Line columns length(lines) times.)
    additions <- lapply(pending, function(f) {
        lines <- readLines(paste0(url_base, f))
        data.frame(Text = lines,
                   # fixed = TRUE: ".txt" is a literal suffix, not a regex
                   Job_id = sub(".txt", "", f, fixed = TRUE),
                   Line = seq_along(lines))
    })
    completed_txt_files <- union(completed_txt_files, pending)
    writeLines(completed_txt_files, "completed_txt_files.txt")
    # Keep only the content columns before appending so the column sets of
    # the stored CSV (which also carries Text_id) and the new rows line up.
    text_df <- text_df[intersect(colnames(text_df), cols)]
    text_df <- do.call(rbind, c(list(text_df), additions))
    # Renumber Text_id over the combined table and persist it. row.names =
    # FALSE stops write.csv from adding a spurious "X" column that would
    # break the rbind on the next run's re-read.
    text_df %<>%
        mutate(Text_id = row_number())
    rownames(text_df) <- NULL
    text_df <- text_df[c("Text_id", "Text", "Job_id", "Line")]
    write.csv(text_df, "text_df.csv", row.names = FALSE)
}

# Tokenize the job-description text into single words (one word per row).
tidy_text_df_words <- text_df %>%
    unnest_tokens(word, Text)

# Remove stopwords, then count word frequency. The join key is named
# explicitly so dplyr no longer emits the "Joining with `by = join_by(word)`"
# message the implicit join produced.
tidy_text_words_analysis <- tidy_text_df_words %>%
    anti_join(get_stopwords(), by = "word") %>%
    count(word, sort = TRUE)

datatable(tidy_text_words_analysis, options = list(pageLength = 25))
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

# Visualize the 50 most frequent non-stopword terms.
tidy_text_words_analysis %>%
    with(wordcloud(word, n, max.words = 50))

# Tokenize the description text into two-word phrases (bigrams).
tidy_text_df_bigrams <- unnest_tokens(text_df, bigram, Text,
                                      token = "ngrams", n = 2)

# Rank the bigrams by how often they occur across all job descriptions.
tidy_text_bigrams_analysis <- count(tidy_text_df_bigrams, bigram, sort = TRUE)

datatable(tidy_text_bigrams_analysis, options = list(pageLength = 25))
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

# Visualize the 50 most frequent bigrams.
with(tidy_text_bigrams_analysis, wordcloud(bigram, n, max.words = 50))